In [1]:

    
%matplotlib inline
import os
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts

from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor


from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse









    



//anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Load and Merge Datasets



In [2]:

    
#Load the sensors dataset
sensor = pd.read_csv('sensor_updated.csv')
sensor.info()
sensor.head()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70989 entries, 0 to 70988
Data columns (total 7 columns):
datetime             70989 non-null object
temperature          3426 non-null float64
humidity             3426 non-null float64
co2                  3426 non-null float64
light                3426 non-null float64
noise                3426 non-null float64
bluetooth_devices    3426 non-null float64
dtypes: float64(6), object(1)
memory usage: 3.8+ MB






    Out[2]:






  
    
      
      datetime
      temperature
      humidity
      co2
      light
      noise
      bluetooth_devices
    
  
  
    
      0
      2017-03-25 09:05:00
      22.60
      36.900000
      781.000000
      430.0
      511.000000
      1.000000
    
    
      1
      2017-03-25 09:06:00
      23.80
      38.950000
      765.900000
      426.9
      502.000000
      11.400000
    
    
      2
      2017-03-25 09:07:00
      23.85
      38.910000
      768.300000
      422.4
      510.400000
      19.600000
    
    
      3
      2017-03-25 09:08:00
      23.90
      38.772727
      777.454545
      424.0
      506.909091
      29.727273
    
    
      4
      2017-03-25 09:09:00
      23.91
      38.730000
      770.800000
      438.1
      500.700000
      35.900000



In [3]:

    
#Load the occupancy dataset
occupancy = pd.read_csv('image_variations.csv')
occupancy.info()
occupancy.head()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3551 entries, 0 to 3550
Data columns (total 4 columns):
datetime         3551 non-null object
control_F_rms    3551 non-null float64
control_L_rms    3551 non-null float64
rolling_rms      3551 non-null float64
dtypes: float64(3), object(1)
memory usage: 111.0+ KB






    Out[3]:






  
    
      
      datetime
      control_F_rms
      control_L_rms
      rolling_rms
    
  
  
    
      0
      2017-03-25 09:11:00
      0.000000
      68.764028
      0.000000
    
    
      1
      2017-03-25 09:12:00
      15.242697
      69.110523
      15.242697
    
    
      2
      2017-03-25 09:13:00
      15.526992
      69.169608
      15.087697
    
    
      3
      2017-03-25 09:14:00
      18.106792
      69.253149
      15.422978
    
    
      4
      2017-03-25 09:15:00
      19.040465
      69.159929
      14.799398



In [4]:

    
#Merge the two datasets by datetime
df = pd.merge(sensor, occupancy[['datetime','rolling_rms']], on='datetime', how='inner')
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3551 entries, 0 to 3550
Data columns (total 8 columns):
datetime             3551 non-null object
temperature          3379 non-null float64
humidity             3379 non-null float64
co2                  3379 non-null float64
light                3379 non-null float64
noise                3379 non-null float64
bluetooth_devices    3379 non-null float64
rolling_rms          3551 non-null float64
dtypes: float64(7), object(1)
memory usage: 249.7+ KB



In [5]:

    
#Drop rows with any NaN values
df = df[pd.notnull(df['temperature'])]



In [6]:

    
#Round the rolling_rms feature to integer
df.rolling_rms = df.rolling_rms.round()



In [7]:

    
#Drop the datetime feature
df = df.drop('datetime', 1)



In [8]:

    
np.where(np.isnan(df))









    Out[8]:





(array([], dtype=int64), array([], dtype=int64))



In [9]:

    
pd.scatter_matrix(df, alpha=0.2, figsize=(18,18), diagonal='kde')
plt.show()

Models -- Regression

note to self: Models predicting the number of occupancy, or the category? normalization or not?



In [10]:

    
df_features = df.ix[:,0:-1]
df_labels = df.ix[:,-1]



In [11]:

    
splits = cv.train_test_split(df_features, df_labels, test_size=0.2)
X_train, X_test, y_train, y_test = splits



In [12]:

    
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Ridge Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))









    



Ridge Regression model
Mean Squared Error: 27.593
Coefficient of Determination: 0.106



In [13]:

    
model = RandomForestRegressor()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))









    



Random Forest model
Mean squared error = 18.246
R2 score = 0.409

Models -- Classification



In [14]:

    
import time
from sklearn import metrics
from sklearn import cross_validation
from sklearn.cross_validation import KFold

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier



In [15]:

    
df.describe()









    Out[15]:






  
    
      
      temperature
      humidity
      co2
      light
      noise
      bluetooth_devices
      rolling_rms
    
  
  
    
      count
      3379.000000
      3379.000000
      3379.000000
      3379.000000
      3379.000000
      3379.000000
      3379.000000
    
    
      mean
      23.125713
      39.380875
      1190.216754
      470.305878
      328.453681
      242.665067
      12.732169
    
    
      std
      1.546089
      7.176134
      164.681403
      575.067659
      176.818161
      148.146838
      5.569003
    
    
      min
      21.000000
      21.190909
      659.833333
      143.000000
      58.000000
      0.000000
      0.000000
    
    
      25%
      22.200000
      37.489444
      1074.477273
      183.125000
      143.916667
      126.000000
      10.000000
    
    
      50%
      22.866667
      39.180000
      1229.416667
      222.625000
      461.750000
      207.416667
      11.000000
    
    
      75%
      23.191667
      45.487500
      1300.625000
      443.550000
      503.900000
      352.541667
      15.000000
    
    
      max
      29.350000
      50.730000
      1724.200000
      2891.583333
      574.000000
      634.833333
      65.000000



In [16]:

    
def occupancy(c):
  if c['rolling_rms'] < 10:
    return '1'
  elif c['rolling_rms'] > 20:
    return '3'
  else:
    return '2'

df['occupancy'] = df.apply(occupancy, axis=1)
df = df.drop('rolling_rms', 1)



In [17]:

    
data   = df.iloc[:, 0:-1]
target = df.iloc[:, -1]



In [18]:

    
model = SVC()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("SVM Classifier")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)









    



SVM Classifier
Mean squared error = 38.938
0.137573964497



In [19]:

    
model = KNeighborsClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("K Neighbors Classifier")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)









    



K Neighbors Classifier
Mean squared error = 30.766
0.152366863905



In [20]:

    
model = RandomForestClassifier()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)

print("Random Forest Classifierr")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print(accuracy)









    



Random Forest Classifierr
Mean squared error = 25.238
0.181952662722



In [ ]:

	datetime	temperature	humidity	co2	light	noise	bluetooth_devices
0	2017-03-25 09:05:00	22.60	36.900000	781.000000	430.0	511.000000	1.000000
1	2017-03-25 09:06:00	23.80	38.950000	765.900000	426.9	502.000000	11.400000
2	2017-03-25 09:07:00	23.85	38.910000	768.300000	422.4	510.400000	19.600000
3	2017-03-25 09:08:00	23.90	38.772727	777.454545	424.0	506.909091	29.727273
4	2017-03-25 09:09:00	23.91	38.730000	770.800000	438.1	500.700000	35.900000

	datetime	control_F_rms	control_L_rms	rolling_rms
0	2017-03-25 09:11:00	0.000000	68.764028	0.000000
1	2017-03-25 09:12:00	15.242697	69.110523	15.242697
2	2017-03-25 09:13:00	15.526992	69.169608	15.087697
3	2017-03-25 09:14:00	18.106792	69.253149	15.422978
4	2017-03-25 09:15:00	19.040465	69.159929	14.799398

	temperature	humidity	co2	light	noise	bluetooth_devices	rolling_rms
count	3379.000000	3379.000000	3379.000000	3379.000000	3379.000000	3379.000000	3379.000000
mean	23.125713	39.380875	1190.216754	470.305878	328.453681	242.665067	12.732169
std	1.546089	7.176134	164.681403	575.067659	176.818161	148.146838	5.569003
min	21.000000	21.190909	659.833333	143.000000	58.000000	0.000000	0.000000
25%	22.200000	37.489444	1074.477273	183.125000	143.916667	126.000000	10.000000
50%	22.866667	39.180000	1229.416667	222.625000	461.750000	207.416667	11.000000
75%	23.191667	45.487500	1300.625000	443.550000	503.900000	352.541667	15.000000
max	29.350000	50.730000	1724.200000	2891.583333	574.000000	634.833333	65.000000